library(tidyverse); theme_set(theme_bw())
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggpubr)
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggbeeswarm)
library(fs)
# Get input files
kmer_QC_file <- "/labs/kingsley/ambenj/myosin_dups/analysis/ase/kmer_analysis/MYH_BEPA_RABS_27mers_QC.txt"
m_file <- "/labs/kingsley/ambenj/myosin_dups/analysis/ase/metadata.txt"
seq_stats_file <- "/labs/kingsley/ambenj/myosin_dups/raw/RNAseq/30-1151453543/sequencing_stats.csv"
kinnex_counts_file <- "/labs/kingsley/ambenj/myosin_dups/analysis/ase/flnc_mapping/align_RABS-3copy_BEPA-5copy/exons3-35_counts/exon3-35_counts_all.txt"
# Check a reads detected by multiple kmers are assigned to the same myosin copy and allele
r <- read_tsv("/labs/kingsley/ambenj/myosin_dups/analysis/ase/kmer_analysis/kmer_counts/9_matching_reads.txt", col_names = c("kmer", "read")) %>% 
  group_by(read) %>% 
  summarize(n = n(),
         kmers = paste(sort(unique(kmer)),collapse=", ")) %>% 
  filter(n>1) %>% 
  mutate(C1 = ifelse(str_detect(kmers, "C1"), 1, 0),
         C2 = ifelse(str_detect(kmers, "C2"), 1, 0),
         C3 = ifelse(str_detect(kmers, "C3"), 1, 0),
         BEPA = ifelse(str_detect(kmers, "BEPA"), 1, 0),
         RABS = ifelse(str_detect(kmers, "RABS"), 1, 0),
         MYH_copies_detect = C1 + C2 + C3,
         alleles_detected = BEPA + RABS)
## Rows: 227291 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): kmer, read
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count number of reads assigned to more than one MYH copy
r %>% 
  filter(MYH_copies_detect > 1) 
# Are there any kmers assigned to more than one MYH copy that are more common than others?
r %>% 
  filter(MYH_copies_detect > 1) %>% 
  group_by(kmers) %>% 
  summarise(n=n()) %>% 
  arrange(-n)
# Count number of reads detected by both BEPA and RABS alleles
r %>% 
  filter(alleles_detected > 1, !str_detect(kmers, "C3dup"))
# Read metadata file
m <- read_tsv(m_file) %>% 
  mutate(Tissue_type = case_when(Tissue_type == "Whole larvae" ~ "Whole juvenile",
                                 Tissue_type == "Flank muscle" ~ "Adult flank",
                                 Tissue_type == "Abductor muscle" ~ "Adult abductor",
                                 Tissue_type == "Adductor muscle" ~ "Adult adductor",
                                 TRUE ~ Tissue_type),
    Tissue_type = factor(Tissue_type, levels=c("Whole juvenile", "Adult flank", "Adult abductor", "Adult adductor")))
## Rows: 45 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (10): ID, Fish, Sample1, Sample2, Type, Age, Temperature, Tissue_type, C...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
m
# Read sequencing stats file
s <- read_csv(seq_stats_file)
## Rows: 45 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Project, Sample ID, Barcode Sequence
## dbl (4): # Reads, Yield (Mbases), Mean Quality Score, % Bases >= 30
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
s
# Read kmer information
k <- read_tsv(kmer_QC_file)
## Rows: 102 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): kmer_name, BLAT_QC_BEPA, BLAT_QC_RABS, Isoseq_QC, Sufficient cover...
## dbl  (5): length, min, max, min_original, max_original
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Get list of kmer files per sample
kmer_files <- dir_ls(path = "/labs/kingsley/ambenj/myosin_dups/analysis/ase/kmer_analysis/kmer_counts", glob = "*_kmers_with_counts.tsv")

# function to add file name to dataframe
read_and_record_filename <- function(filename){
  read_tsv(filename) %>%
  mutate(filename = path_file(filename))
  }

# gather kmer files from all samples into one dataframe
kmer_counts <- map_df(kmer_files, read_and_record_filename)%>% 
  separate(filename, into = c("sample"), sep = "_") %>% 
  left_join(.,k, by=c("kmer_name"="kmer_name")) %>% 
  mutate(group = case_when(str_detect(kmer_name, "C3dup") ~ "C3dup_ASE",
                           str_detect(kmer_name, "ASE") ~ "ASE",
                           str_detect(kmer_name, "MCE") ~ "MCE"))
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 102 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): kmer_name, kmer, kmer_rc
## dbl (1): match_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 1 pieces. Additional pieces discarded in 4590 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
kmer_counts

Allele-specific expression analysis

# Read kmer counts file
df_ase <- kmer_counts %>% 
  filter(group == "ASE", use=="yes", spans_intron == "no") %>% 
  separate(kmer_name, into=c("source", "ASE"), remove=FALSE, extra="drop", sep="_") %>% 
  unite(kmer_pair, source, ASE, sep = "_", remove = FALSE) %>% 
  select(source, kmer_pair, use, exon, spans_intron, sample, allele, match_count, min) %>% 
  pivot_wider(id_cols = c(kmer_pair, use, sample, source, exon, spans_intron, min), names_from = allele, values_from = match_count) %>% 
  mutate(total_counts = BEPA + RABS,
         include = case_when(BEPA >=1 & RABS >= 1 & total_counts >= 100 ~ "yes",
                             TRUE ~ "no"),
#         BEPA = case_when(BEPA==0 ~ 1, # Add minimum of one read to counts so log scores wont be zero
#                          TRUE ~ BEPA),
#         RABS = case_when(RABS==0 ~ 1,
#                          TRUE ~ RABS),
    BEPA_RABS_ratio = BEPA / RABS,
    BEPA_RABS_log2_ratio = log2(BEPA/RABS),
    prop_BEPA = BEPA/(BEPA+RABS)) %>% 
  left_join(., m, by=c("sample" = "ID")) %>% 
  left_join(., s, by=c("sample" = "Sample ID")) %>% 
  mutate(RPM = (total_counts/ `# Reads`)*1000000) # Table contains read pairs 

df_ase
df_ase %>% 
  group_by(kmer_pair) %>% 
  summarize(n=n()) %>% 
  arrange()
df_ase %>% 
  filter(kmer_pair == "C1_ASE2") %>% 
  group_by(Tissue_type, Temperature) %>% 
  summarize(n=n())
## `summarise()` has grouped output by 'Tissue_type'. You can override using the
## `.groups` argument.
# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results <- df_ase %>%
  group_by(kmer_pair, Temperature, source, Tissue_type) %>%
  filter(include == "yes") %>%
  summarize(
    n = sum(!is.na(BEPA_RABS_log2_ratio)),
    p_value = if (n >= 3) wilcox.test(prop_BEPA, mu = 0.5)$p.value else NA_real_,
#    p_value = if (n >= 3) wilcox.test(BEPA_RABS_log2_ratio, mu = 0, alternative = "greater")$p.value else NA_real_,
    .groups = "drop"
  ) %>%
  mutate(
    significance = case_when(
      is.na(p_value)     ~ NA_character_,
      p_value <= 0.001   ~ "***",
      p_value <= 0.01    ~ "**",
      p_value <= 0.05    ~ "*",
      TRUE               ~ "ns"
    )
  )

# Add y-label positions for plotting significance results
label_positions <- df_ase %>%
  filter(include == "yes") %>%
  group_by(kmer_pair,Temperature, source, Tissue_type) %>%
  summarize(y_pos = max(prop_BEPA, na.rm = TRUE) + 0.05, .groups = "drop")

test_results <- left_join(test_results, label_positions,
                          by = c("kmer_pair", "Temperature", "source", "Tissue_type"))


# Plot with significance values
ASE_plot <- df_ase %>%
  filter(include == "yes") %>%
  ggplot(aes(reorder(kmer_pair, min), prop_BEPA, color = Temperature)) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=1, dodge.width = .7, alpha = 0.7, size=2) +
  scale_color_manual(values = c("black", "grey60")) +
  facet_grid(cols = vars(source), rows = vars(Tissue_type), scales = "free_x", space = "free") +
  geom_text(
    data = test_results %>% filter(!is.na(significance)),
    aes(x = kmer_pair, y = y_pos, label = significance, group = Temperature),
    position = position_dodge(width = 0.8),
    inherit.aes = FALSE,
    size = 4) +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust=0.75) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylim(0.2, 0.9)

ASE_plot

# Save ASE figure to separate file
ggsave(
  "/labs/kingsley/ambenj/myosin_dups/analysis/ase/figures/ASE_plot_twosided.pdf",
  plot = ASE_plot,
  scale = 1,
  width = 12,
  height = 10,
  units = c("in"),
  dpi = 300,
)
# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results <- df_ase %>%
  group_by(kmer_pair, Temperature, source, Tissue_type) %>%
  filter(include == "yes") %>%
  summarize(
    n = sum(!is.na(BEPA_RABS_log2_ratio)),
    p_value = if (n >= 3) wilcox.test(BEPA_RABS_log2_ratio, mu = 0, alternative = "greater")$p.value else NA_real_,
    .groups = "drop"
  ) %>%
  mutate(
    significance = case_when(
      is.na(p_value)     ~ NA_character_,
      p_value <= 0.001   ~ "***",
      p_value <= 0.01    ~ "**",
      p_value <= 0.05    ~ "*",
      TRUE               ~ "ns"
    )
  )

# Add y-label positions for plotting significance results
label_positions <- df_ase %>%
  filter(include == "yes") %>%
  group_by(kmer_pair,Temperature, source, Tissue_type) %>%
  summarize(y_pos = max(BEPA_RABS_log2_ratio, na.rm = TRUE) + 0.3, .groups = "drop")

test_results <- left_join(test_results, label_positions,
                          by = c("kmer_pair", "Temperature", "source", "Tissue_type"))


# Plot with significance values
df_ase %>%
  filter(include == "yes") %>%
  ggplot(aes(reorder(kmer_pair, min), BEPA_RABS_log2_ratio, color = Temperature)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=1, dodge.width = .7, alpha = 0.7, size=2) +
  scale_color_manual(values = c("black", "grey60")) +
  facet_grid(cols = vars(source), rows = vars(Tissue_type), scales = "free_x", space = "free") +
  geom_text(
    data = test_results %>% filter(!is.na(significance)),
    aes(x = kmer_pair, y = y_pos, label = significance, group = Temperature),
    position = position_dodge(width = 0.8),
    inherit.aes = FALSE,
    size = 4) +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust=0.75) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylim(-1.5, 3.1)

# Collapse data for each kmer into mean across kmers per condition
means <- df_ase %>%
  filter(include == "yes") %>% 
#  filter(include == "yes", source == "C3" | (source=="C2" & Tissue_type == "Whole juvenile")) %>%
  group_by(source, Tissue_type, Temperature, sample) %>% 
  summarize(n = n(),
            mean_BEPA_RABS_log2_ratio = mean(BEPA_RABS_log2_ratio),
            geom_mean_prop_BEPA = exp(mean(log(prop_BEPA))),
            mean_prop_BEPA = mean(prop_BEPA),
            BEPA_total = sum(BEPA),
            RABS_total = sum(RABS),
            total_prop_BEPA = BEPA_total/ (BEPA_total + RABS_total)) %>% 
  filter((source == "C2" & n==4)|(source=="C3" & n==9)) # Require all kmers to be called
## `summarise()` has grouped output by 'source', 'Tissue_type', 'Temperature'. You
## can override using the `.groups` argument.
means
# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results_mean <- means %>%
  group_by(Temperature, source, Tissue_type) %>%
  summarize(
    n = sum(!is.na(mean_prop_BEPA)),
    p_value = if (n >= 3) wilcox.test(mean_prop_BEPA, mu = 0.5)$p.value else NA_real_,
    .groups = "drop"
  ) %>%
  mutate(
    significance = case_when(
      is.na(p_value)     ~ NA_character_,
      p_value <= 0.001   ~ "***",
      p_value <= 0.01    ~ "**",
      p_value <= 0.05    ~ "*",
      TRUE               ~ "ns"
    )
  )

# Add y-label positions for plotting significance results
label_positions_means <- means %>%
  group_by(Temperature, source, Tissue_type) %>%
  summarize(y_pos = max(mean_prop_BEPA, na.rm = TRUE) + 0.015, .groups = "drop")


test_results_means <- left_join(test_results_mean, label_positions_means,
                          by = c("Temperature", "source", "Tissue_type"))

# Plot collapsed ASE results using arithmetic mean of proportion BEPA
ase_means_temp_plot <- means %>% 
  ggplot(aes(Tissue_type, mean_prop_BEPA, color = Temperature)) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=2) +
  scale_color_manual(values = c("black", "grey60")) +
  scale_y_continuous(labels = percent, limits=c(0.45, 0.75)) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  geom_text(
    data = test_results_means %>% filter(!is.na(significance)),
    aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
    position = position_dodge(width = 0.8),
    inherit.aes = FALSE,
    size = 4) +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.5) +
  theme_cowplot(7) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
  theme(legend.position = "top") +
#  ylim(0.45, 0.75) +
  xlab("Tissue") +
  ylab("Percent BEPA allele expression")
ase_means_temp_plot

# Plot collapsed ASE results using arithmetic mean of kmers
ase_means_plot <- means %>% 
    filter(Temperature == "18C") %>% 
  ggplot(aes(Tissue_type, mean_prop_BEPA, color = source, fill=source)) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=2, shape=21) +
  scale_fill_manual(values = c("#f0e442", "#009e73")) +
  scale_color_manual(values = c("#c6ba10", "#009e73")) +
  scale_y_continuous(labels = percent) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  geom_text(
    data = test_results_means %>% filter(!is.na(significance), Temperature == "18C"),
    aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
    position = position_dodge(width = 0.8),
    inherit.aes = FALSE,
    size = 4) +
  theme_cowplot(7) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
  theme(legend.position = "none") +
  xlab("Tissue") +
  ylab("Percent BEPA allele expression")

ase_means_plot

# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results_mean <- means %>%
  group_by(Temperature, source, Tissue_type) %>%
  summarize(
    n = sum(!is.na(geom_mean_prop_BEPA)),
    p_value = if (n >= 3) wilcox.test(geom_mean_prop_BEPA, mu = 0.5)$p.value else NA_real_,
    .groups = "drop"
  ) %>%
  mutate(
    significance = case_when(
      is.na(p_value)     ~ NA_character_,
      p_value <= 0.001   ~ "***",
      p_value <= 0.01    ~ "**",
      p_value <= 0.05    ~ "*",
      TRUE               ~ "ns"
    )
  )

# Add y-label positions for plotting significance results
label_positions_means <- means %>%
  group_by(Temperature, source, Tissue_type) %>%
  summarize(y_pos = max(geom_mean_prop_BEPA, na.rm = TRUE) + 0.015, .groups = "drop")


test_results_means <- left_join(test_results_mean, label_positions_means,
                          by = c("Temperature", "source", "Tissue_type"))

# Plot collapsed ASE results using geometric mean of kmers
means %>% 
  ggplot(aes(Tissue_type, geom_mean_prop_BEPA, color = Temperature)) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3) +
  scale_color_manual(values = c("black", "grey60")) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  geom_text(
    data = test_results_means %>% filter(!is.na(significance)),
    aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
    position = position_dodge(width = 0.8),
    inherit.aes = FALSE,
    size = 4) +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.5) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
#  ylim(0.18,0.82) +
  xlab("Tissue type") +
  ylab("Proportion of expression from BEPA allele")

# Plot collapsed ASE results using geometric mean of kmers
means %>% 
    filter(Temperature == "18C") %>% 
  ggplot(aes(Tissue_type, geom_mean_prop_BEPA, color = source, fill=source)) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3, shape=21) +
  scale_fill_manual(values = c("#f0e442", "#009e73")) +
  scale_color_manual(values = c("#c6ba10", "#009e73")) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  geom_text(
    data = test_results_means %>% filter(!is.na(significance), Temperature == "18C"),
    aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
    position = position_dodge(width = 0.8),
    inherit.aes = FALSE,
    size = 4) +
  theme_cowplot(6) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
  theme(legend.position = "none") +
#  ylim(0.18,0.82) +
  xlab("Tissue type") +
  ylab("Proportion of expression from BEPA allele")

# Perform two-sided, one sample Wilcoxon test (safer to use with small n=3-10) to determine if sample is significantly greater than 0
test_results_mean <- means %>%
  group_by(Temperature, source, Tissue_type) %>%
  summarize(
    n = sum(!is.na(mean_BEPA_RABS_log2_ratio)),
    p_value = if (n >= 3) wilcox.test(mean_BEPA_RABS_log2_ratio, mu = 0)$p.value else NA_real_,
    .groups = "drop"
  ) %>%
  mutate(
    significance = case_when(
      is.na(p_value)     ~ NA_character_,
      p_value <= 0.001   ~ "***",
      p_value <= 0.01    ~ "**",
      p_value <= 0.05    ~ "*",
      TRUE               ~ "ns"
    )
  )

# Add y-label positions for plotting significance results
label_positions_means <- means %>%
  group_by(Temperature, source, Tissue_type) %>%
#  summarize(y_pos = max(mean_BEPA_RABS_log2_ratio, na.rm = TRUE) + 0.03, .groups = "drop")
  summarize(y_pos = max(mean_BEPA_RABS_log2_ratio, na.rm = TRUE) + 0.1, .groups = "drop")


test_results_means <- left_join(test_results_mean, label_positions_means,
                          by = c("Temperature", "source", "Tissue_type"))

# Plot collapsed ASE results
means %>% 
  ggplot(aes(Tissue_type, mean_BEPA_RABS_log2_ratio, color = Temperature)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3) +
  scale_color_manual(values = c("black", "grey60")) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  geom_text(
    data = test_results_means %>% filter(!is.na(significance)),
    aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
    position = position_dodge(width = 0.8),
    inherit.aes = FALSE,
    size = 4) +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.5) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
#  ylim(0.18,0.82) +
#  ylim(-2.1, 2.1) +
  xlab("Tissue type") +
  ylab("log2(BEPA/RABS)")

# Plot collapsed ASE results for just 18C
means %>% 
  filter(Temperature == "18C") %>% 
  ggplot(aes(Tissue_type, mean_BEPA_RABS_log2_ratio, color = source, fill=source)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3, shape=21) +
  scale_fill_manual(values = c("#f0e442", "#009e73")) +
  scale_color_manual(values = c("#c6ba10", "#009e73")) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  geom_text(
    data = test_results_means %>% filter(!is.na(significance), Temperature =="18C"),
    aes(x = Tissue_type, y = y_pos, label = significance, group = Temperature),
    position = position_dodge(width = 0.8),
    inherit.aes = FALSE,
    size = 4) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
#  ylim(0.18,0.82) +
  ylim(-0.5, 1.5) +
  xlab("Tissue type") +
  ylab("log2(BEPA/RABS)")

comps <- list(c("Whole juvenile", "Adult flank"), c("Adult flank", "Adult abductor"), c("Adult abductor", "Adult adductor"),
              c("Whole juvenile", "Adult abductor"), c("Whole juvenile", "Adult adductor"), c("Adult flank", "Adult adductor"))
# Plot just 18C
means %>% 
  filter(Temperature == "18C") %>% 
  ggplot(aes(Tissue_type, mean_prop_BEPA)) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3) +
    stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.5, comparisons = comps) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free")
## Warning: Computation failed in `stat_signif()`.
## Caused by error in `wilcox.test.default()`:
## ! not enough 'y' observations

# Plot just 18C
means %>% 
  filter(Temperature == "18C", Tissue_type == "Whole juvenile") %>% 
  ggplot(aes(source, mean_prop_BEPA)) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  geom_boxplot(fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=2, dodge.width = .7, alpha = 0.7, size=3) +
    stat_compare_means(method = "wilcox.test", vjust = 0.5)

# Plot total counts normalized to number of reads sequenced in each library
df_ase %>% 
  ggplot(aes(kmer_pair, RPM, color= Temperature)) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  scale_color_manual(values = c("black", "grey60")) +
  facet_grid(cols=vars(source), rows = vars(Tissue_type), scales = "free_x", space= "free") +
  stat_compare_means(method = "wilcox.test", label = "p.signif") +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Plot total counts (log10 scale) normalized to number of reads sequenced in each library 
total_expression <- df_ase %>% 
  ggplot(aes(kmer_pair, log10(RPM+1), color= Temperature)) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  scale_color_manual(values = c("black", "grey60")) +
  facet_grid(cols=vars(source), rows = vars(Tissue_type), scales = "free_x", space= "free") +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.75) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylim(0, 4)

total_expression

# Save ASE figure to separate file
ggsave(
  "/labs/kingsley/ambenj/myosin_dups/analysis/ase/figures/totalexpressionASE_log10RPM_plot.pdf",
  plot = total_expression,
  scale = 1,
  width = 12,
  height = 10,
  units = c("in"),
  dpi = 300,
)
total_counts_means <- df_ase %>%
  group_by(source, Tissue_type, Temperature, sample) %>% 
  summarize(mean_log10RPM = mean(log10(RPM+1)))
## `summarise()` has grouped output by 'source', 'Tissue_type', 'Temperature'. You
## can override using the `.groups` argument.
total_counts_means %>% 
  ggplot(aes(Tissue_type, mean_log10RPM, color = Temperature)) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  scale_color_manual(values = c("black", "grey60")) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.25) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
  xlab("Tissue type") +
  ylab("log10(RPM+1)")

Total count analysis from myosin copy expression (MCE) kmers

# Read kmer counts file and get MCE kmers
df_mce <- kmer_counts %>% 
  filter(group == "MCE", use=="yes", spans_intron=="no") %>% 
  separate(kmer_name, into=c("kmer_group"), remove=FALSE, extra="drop", sep="_") %>% 
  left_join(., m, by=c("sample" = "ID")) %>% 
  left_join(., s, by=c("sample" = "Sample ID")) %>% 
  mutate(RPM = (match_count/ `# Reads`)*1000000) # Table contains read pairs 
# Plot total counts (log10 scale) normalized to number of reads sequenced in each library 
mce_plot <- df_mce %>% 
  ggplot(aes(kmer_group, log10(RPM+1), color= Temperature)) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  scale_color_manual(values = c("black", "grey60")) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  facet_grid(cols=vars(source), rows = vars(Tissue_type), scales = "free_x", space= "free") +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.75) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylim(0, 3.8)

mce_plot

# Plot total counts (log10 scale) normalized to number of reads sequenced in each library 
df_mce %>% 
  filter(Temperature == "18C") %>% 
  ggplot(aes(kmer_group, log10(RPM+1), color= source, fill=source)) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2, shape=21) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  scale_fill_manual(values = c("#e69f00", "#f0e442", "#009e73")) +
  scale_color_manual(values = c("#e69f00", "#c6ba10", "#009e73")) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  facet_grid(cols=vars(source), rows = vars(Tissue_type), scales = "free_x", space= "free") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylim(0, 3.8)

# Average kmers
mce_means <- df_mce %>%
  group_by(source, Tissue_type, Temperature, sample) %>% 
  summarize(mean_log10RPM = mean(log10(RPM+1)))
## `summarise()` has grouped output by 'source', 'Tissue_type', 'Temperature'. You
## can override using the `.groups` argument.
mce_means_temp_plot <- mce_means %>% 
  ggplot(aes(Tissue_type, mean_log10RPM, color = Temperature)) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=2) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.25) +
  scale_color_manual(values = c("black", "grey60")) +
  theme_cowplot(7) +
  theme(legend.position = "top") +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
  ylim(0,3.1) +
  xlab("Tissue") +
  ylab("Total expression, log10(RPM+1)")
mce_means_temp_plot

mce_means %>% 
  filter(Temperature == "18C") %>% 
  ggplot(aes(Tissue_type, mean_log10RPM, fill = source, color=source)) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.7, size=3, shape=21) +
#  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
#  stat_compare_means(method = "wilcox.test", label = "p.signif", vjust = 0.25) +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  scale_fill_manual(values = c("#e69f00", "#f0e442", "#009e73")) +
  scale_color_manual(values = c("#e69f00", "#c6ba10", "#009e73")) +
  scale_x_discrete(labels = label_wrap(10)) +
  ylim(0,3) +
  xlab("Tissue type") +
  ylab("log10(RPM+1)")

mce_means_plot <- mce_means %>% 
  filter(Temperature == "18C") %>% 
  ggplot(aes(Tissue_type, mean_log10RPM, color=source, fill=source)) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.7, size=2, shape=21) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  scale_fill_manual(values = c("#e69f00", "#f0e442", "#009e73")) +
  scale_color_manual(values = c("#e69f00", "#c6ba10", "#009e73")) +
  theme_cowplot(7) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
  theme(legend.position = "none") +
  ylim(0,3) +
  xlab("Tissue") +
  ylab("Total expression, log10(RPM+1)")
mce_means_plot

mce_means %>% 
  filter(Temperature == "18C") %>% 
  ggplot(aes(Tissue_type, mean_log10RPM)) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA, outlier.shape = NA) +
  geom_beeswarm(cex=1.2, dodge.width = .7, alpha = 0.5, size=3) +
  facet_grid(cols = vars(source), scales = "free_x", space = "free") +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  scale_x_discrete(labels = label_wrap(10)) +
  ylim(0,3) +
  xlab("Tissue type") +
  ylab("log10(RPM+1)")

# # Get count ratios for kmer that can distingish BEPA copies as well as RABS
# df_wide_C3_copies <- df %>% 
#     filter(kmer %in% c("CTAGAGGGAGCAATGAAAGAGGCTCGTTCT", "CTAGAGGGAGCATTGAAAGAGGCTCGTTCT", "CTTGAGGGAGCATTGAAGGAGGCTCGTTCT")) %>% 
#   select(source, exon, sample, allele, total_ASEcounts) %>% 
#   pivot_wider(id_cols = c(source, exon, sample), names_from = allele, values_from = total_ASEcounts) %>% 
#   mutate(total_counts = `BEPA-ab` + `BEPA-c` + RABS) %>% 
#   mutate(BEPA_RABS_ratio = (`BEPA-ab` + `BEPA-c`) / RABS)
#   
# df_wide_C3_copies
# df %>% 
#     filter(kmer %in% c("CTAGAGGGAGCAATGAAAGAGGCTCGTTCT", "CTAGAGGGAGCATTGAAAGAGGCTCGTTCT", "CTTGAGGGAGCATTGAAGGAGGCTCGTTCT")) %>% 
#   left_join(., m, by=c("sample" = "ID")) %>% 
#   ggplot(aes(sample, total_ASEcounts, fill=factor(allele, levels=c("RABS", "BEPA-ab", "BEPA-c")))) +
#   geom_bar(position="fill", stat="identity") +
#   scale_fill_manual(values = c("#D16D6A", "#4B77D1", "#CCDAF5"), name = "C3 copy") +
#   geom_hline(yintercept = 0.5, linetype = "dashed") +
#   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
#   ylab("Proportion of expression") +
#   facet_grid(cols=vars(Tissue_type, Temperature), scales = "free_x", space= "free")

Kinnex data analysis

k <- read_tsv(kinnex_counts_file) %>% 
  separate(sample, into=c("ID"), sep="_") %>% 
  separate(name, into=c("allele", "MYH_copy"), sep="_", remove=FALSE) %>% 
  mutate(allele=case_when(MYH_copy=="C3A" ~ "BEPA-C3A",
                          MYH_copy=="C3B" ~ "BEPA-C3B",
                          MYH_copy=="C3L" ~ "BEPA-C3L",
                          MYH_copy=="C3" ~ "RABS-C3",
                          TRUE ~ allele),
         allele = factor(allele, levels=c("RABS", "BEPA", "RABS-C3", "BEPA-C3A", "BEPA-C3B", "BEPA-C3L"))) %>% 
  left_join(., m)
## Rows: 128 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): name, chr, sample
## dbl (3): start, stop, read_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 1 pieces. Additional pieces discarded in 128 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Warning: Expected 2 pieces. Additional pieces discarded in 128 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Joining with `by = join_by(ID)`
k %>% 
  filter(str_detect(MYH_copy, "C3")) %>% 
  ggplot(aes(ID, read_count, fill=allele)) +
  geom_bar(position="fill", stat="identity", color="black") +
  scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3 copy") +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylab("Proportion of expression") +
  facet_grid(cols=vars(Tissue_type, Temperature), scales = "free_x", space= "free")

k_C3_temp_plot <- k %>% 
  filter(str_detect(MYH_copy, "C3")) %>% 
  ggplot(aes(ID, read_count, fill=allele)) +
  geom_bar(position="fill", stat="identity", color="black") +
  scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3 copy") +
 # scale_fill_manual(values = c("#D16D6A", "#4B77D1", "#789DE6","#CCDAF5"), name = "MYH3 copy") +
  scale_y_continuous(expand = expansion(mult = c(0, 0)),
                     breaks = seq(0, 1, by = 0.1),
                     labels = percent) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  theme_cowplot(7) +
#  panel_border(color="black", size=0.75) +
  theme(axis.text.x = element_blank(),
        axis.ticks.x=element_blank(),
        strip.background = element_rect(colour=NA, fill=NA),
        strip.placement = "outside",
        legend.position = "none") +
  ylab("Percent of total C3 expression") +
  xlab("Tissue") +
  facet_grid(cols=vars(Tissue_type, Temperature), scales = "free_x", space= "free",
             labeller = label_wrap_gen(width = 2, multi_line = TRUE),
             switch = "x")
k_C3_temp_plot

k_C3_plot <- k %>% 
  filter(str_detect(MYH_copy, "C3"), Temperature=="18C") %>% 
  ggplot(aes(ID, read_count, fill=allele)) +
  geom_bar(position="fill", stat="identity", color="black") +
  scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3 copy") +
 # scale_fill_manual(values = c("#D16D6A", "#4B77D1", "#789DE6","#CCDAF5"), name = "MYH3 copy") +
  scale_y_continuous(expand = expansion(mult = c(0, 0)),
                     breaks = seq(0, 1, by = 0.1),
                     labels = percent) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  theme_cowplot(7) +
#  panel_border(color="black", size=0.75) +
  theme(axis.text.x = element_blank(),
        axis.ticks.x=element_blank(),
        strip.background = element_rect(colour=NA, fill=NA),
        strip.placement = "outside",
        legend.position = "none") +
  ylab("Percent of total C3 expression") +
  xlab("Tissue") +
  facet_grid(cols=vars(Tissue_type), scales = "free_x", space= "free",
             labeller = label_wrap_gen(width = 2, multi_line = TRUE),
             switch = "x")
k_C3_plot

k %>% 
  filter(str_detect(MYH_copy, "C2")) %>% 
  ggplot(aes(ID, read_count, fill=allele)) +
  geom_bar(position="fill", stat="identity", color="black") +
  scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3C2 copy") +
  scale_y_continuous(expand = expansion(mult = c(0, 0))) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylab("Proportion of expression") +
  facet_grid(cols=vars(Tissue_type, Temperature), scales = "free_x", space= "free")

k %>% 
  filter(str_detect(MYH_copy, "C2"), Temperature=="18C", Tissue_type=="Whole juvenile") %>% 
  ggplot(aes(ID, read_count, fill=allele)) +
  geom_bar(position="fill", stat="identity", color="black") +
  scale_fill_manual(values = c("#d73027", "#0072b2", "#56afe1","#c7e4f5"), name = "MYH3C2 copy") +
  scale_y_continuous(expand = expansion(mult = c(0, 0)),
                     breaks = seq(0, 1, by = 0.1)) +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  theme_cowplot(10) +
  panel_border(color="black", size=0.75) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylab("Proportion of expression") +
  facet_grid(cols=vars(Tissue_type), scales = "free_x", space= "free")

Combine into final plot

row1 <- plot_grid(mce_means_plot, labels = c('B'), nrow=1)
d_panel <- plot_grid(NULL, k_C3_plot, ncol=1, rel_heights = c(0.2, 1))
row2 <- plot_grid(ase_means_plot, d_panel, labels = c('C','D'), nrow=1, rel_widths = c(1, 1))
final_plot <- plot_grid(row1, row2, label_size = 12, nrow=2)
final_plot

# Save final figure to separate file
ggsave(
  "/labs/kingsley/ambenj/myosin_dups/analysis/ase/figures/expression_plot_v2.pdf",
  plot = final_plot,
  width = 4.5,
  height = 4.5,
  units = c("in"),
  dpi = 300,
)
# Make supplemental figure
row1 <- plot_grid(mce_means_temp_plot, ase_means_temp_plot, labels = c('A', 'B'), nrow=1, rel_widths = c(2, 1))
row2 <- plot_grid(k_C3_temp_plot,NULL, labels = c('C'), nrow=1, rel_widths = c(3, 1.25))
final_plot <- plot_grid(row1, row2, label_size = 12, nrow=2, rel_heights = c(1.2, 1))
final_plot

# Save final figure to separate file
ggsave(
  "/labs/kingsley/ambenj/myosin_dups/analysis/ase/figures/temp_expression_supp_figure.pdf",
  plot = final_plot,
  width = 6.5,
  height = 6,
  units = c("in"),
  dpi = 300,
)
# Make final kmer table
k